In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import scipy as sp
import statsmodels
In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import OneClassSVM
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from scipy.stats import chi2
from sklearn.covariance import MinCovDet
from scipy.spatial import distance
In [3]:
import plotly.express as px
import pandas as pd
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# to get the connection
init_notebook_mode(connected = True)
# plotly also serves online,
# but we are using just a sample
cf.go_offline()

Mega Sheet¶

In [4]:
mega = pd.read_excel("Mega.xlsx")
In [5]:
mega.head()
Out[5]:
followers_count deliverable_price_in_dollars spot_compensation_in_dollars unit_id type
0 520902 5125.0 2500 52950 TikTok
1 542879 2050.0 1000 50327 TikTok
2 543636 27675.0 13500 53414 TikTok
3 543636 27675.0 13500 52197 TikTok
4 543636 27675.0 13500 52196 TikTok
In [6]:
mega.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               15 non-null     int64  
 1   deliverable_price_in_dollars  15 non-null     float64
 2   spot_compensation_in_dollars  15 non-null     int64  
 3   unit_id                       15 non-null     int64  
 4   type                          15 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 728.0+ bytes
In [7]:
x = mega["followers_count"]
y = mega["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=15

In [8]:
x = mega["followers_count"]
y = mega["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=15

In [9]:
mega.describe().T
Out[9]:
count mean std min 25% 50% 75% max
followers_count 15.0 662100.533333 162168.245100 520902.0 543636.0 561262.0 732866.5 956500.0
deliverable_price_in_dollars 15.0 11112.166667 10325.153184 2050.0 3792.5 6500.0 18125.0 27675.0
spot_compensation_in_dollars 15.0 6176.666667 5971.565560 1000.0 1850.0 4000.0 9750.0 20000.0
unit_id 15.0 49249.000000 3907.544571 40310.0 48138.0 50327.0 52043.0 53414.0
In [10]:
sns.pairplot(mega)
Out[10]:
<seaborn.axisgrid.PairGrid at 0x1264e7557c0>

Treatment for outliers with percentile statistic¶

In [11]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

mega = outlier_detect(mega)
mega.drop("type", axis=1, inplace=True)
In [12]:
mega.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               15 non-null     int64  
 1   deliverable_price_in_dollars  15 non-null     float64
 2   spot_compensation_in_dollars  15 non-null     int64  
 3   unit_id                       15 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 608.0 bytes
In [13]:
fig = px.scatter(mega, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [14]:
fig = px.scatter(mega, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [15]:
sns.set_style("darkgrid")
sns.heatmap(mega.corr(), annot=True)
Out[15]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [16]:
X = mega['followers_count']
y = mega["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [17]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 8699.667
Isolation Forest R2: 0.077
RMSE: 95822692.653
In [18]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [19]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [20]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 8699.667
Minimum Covariance Determinant R2: 0.077
RMSE: 9788.907
In [21]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [22]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [23]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 8662.764
Local Outlier Factor R2: 0.125
RMSE: 9826.529
In [24]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [25]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [26]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 9635.948
One-Class SVM R2: 0.233
RMSE: 11021.574
In [27]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [28]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [33]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 21761.839
Polynomial R2: 0.845
RMSE: 26934.130

2. followers_count to spot_compensation_in_dollars price¶

In [46]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [47]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [48]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [49]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [50]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [51]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [52]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [53]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [54]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [55]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [56]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [57]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [58]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [59]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Conclusion¶

Followers don't have linear relationship with deliverable_price and spot_compensation_price. Followers don't impact deliverable price¶

Conclusion¶

While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives -7 R2 Score which means these variables are not highly correlated in predicting the both price colunn attributes(deliverable_price and spot price). They have no relation for this this

Micro Sheet¶

In [60]:
micro = pd.read_excel("micro.xlsx")
In [61]:
micro.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               109 non-null    int64  
 1   deliverable_price_in_dollars  109 non-null    float64
 2   spot_compensation_in_dollars  109 non-null    float64
 3   unit_id                       109 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 3.5 KB
In [62]:
x = micro["followers_count"]
y = micro["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [63]:
x = micro["followers_count"]
y = micro["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [64]:
micro.describe().T
Out[64]:
count mean std min 25% 50% 75% max
followers_count 109.0 25394.633028 10844.477611 10036.0 16061.0 24008.0 34088.0 48892.0
deliverable_price_in_dollars 109.0 5141.442294 6171.000726 102.5 1947.5 2562.5 5070.0 26650.0
spot_compensation_in_dollars 109.0 2521.110826 3013.192166 50.0 950.0 1250.0 2500.0 13000.0
unit_id 109.0 49666.779817 3405.725817 39878.0 47757.0 50909.0 52073.0 54998.0
In [65]:
sns.pairplot(micro)
Out[65]:
<seaborn.axisgrid.PairGrid at 0x126535bde20>

Treatment for outliers with percentile statistic¶

In [66]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

micro = outlier_detect(micro)
#micro.drop("type", axis=1, inplace=True)
In [67]:
micro.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               109 non-null    int64  
 1   deliverable_price_in_dollars  109 non-null    float64
 2   spot_compensation_in_dollars  109 non-null    float64
 3   unit_id                       109 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 3.5 KB
In [68]:
fig = px.scatter(micro, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [69]:
fig = px.scatter(micro, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [70]:
sns.set_style("darkgrid")
sns.heatmap(micro.corr(), annot=True)
Out[70]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [71]:
X = micro['followers_count']
y = micro["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(73, 1) (73,)

Isolation Forest¶

In [72]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 1330.270
Isolation Forest R2: 0.012
RMSE: 2895660.631
In [73]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [74]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Minimum Covariance Determinant¶

In [75]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 1375.852
Minimum Covariance Determinant R2: 0.046
RMSE: 1767.391
In [76]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [77]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Local Outlier Factor¶

In [78]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 1373.723
Local Outlier Factor R2: 0.041
RMSE: 1763.470
In [79]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [80]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

One-Class SVM¶

In [81]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 1380.030
One-Class SVM R2: 0.071
RMSE: 1777.913
In [82]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [83]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Applying polynomial regression on best outlier model¶

In [92]:
poly = PolynomialFeatures(degree=4)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 1494.071
Polynomial R2: 0.214
RMSE: 2016.141

2. followers_count to spot_compensation_in_dollars price¶

In [93]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [94]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [95]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [96]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [97]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [98]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [99]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [100]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [101]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [102]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [103]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [104]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [105]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [109]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Conclusion¶

Followers don't have linear relationship with deliverable_price and spot_compensation_price. Followers don't impact deliverable price¶

Mid Sheet¶

In [110]:
mid = pd.read_excel("mid.xlsx")
In [111]:
mid.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               88 non-null     int64  
 1   deliverable_price_in_dollars  88 non-null     float64
 2   spot_compensation_in_dollars  88 non-null     float64
 3   unit_id                       88 non-null     int64  
dtypes: float64(2), int64(2)
memory usage: 2.9 KB
In [112]:
x = mid["followers_count"]
y = mid["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [113]:
x = mid["followers_count"]
y = mid["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [114]:
mid.describe().T
Out[114]:
count mean std min 25% 50% 75% max
followers_count 88.0 109674.488636 43514.781055 50660.0 66922.000 101134.5 138673.0 198560.0
deliverable_price_in_dollars 88.0 6128.681932 5427.416630 410.0 2946.875 4715.0 8200.0 41000.0
spot_compensation_in_dollars 88.0 3012.882500 2647.292315 200.0 1437.500 2625.0 4000.0 20000.0
unit_id 88.0 49246.329545 3459.655845 39967.0 47266.000 49445.0 51733.5 55045.0
In [115]:
sns.pairplot(mid)
Out[115]:
<seaborn.axisgrid.PairGrid at 0x126514f6d90>

Treatment for outliers with percentile statistic¶

In [116]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

mid = outlier_detect(mid)
#mid.drop("type", axis=1, inplace=True)
In [117]:
fig = px.scatter(mid, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [118]:
fig = px.scatter(mid, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [119]:
sns.set_style("darkgrid")
sns.heatmap(mid.corr(), annot=True)
Out[119]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [120]:
X = mid['followers_count']
y = mid["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(58, 1) (58,)

Isolation Forest¶

In [121]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 2312.041
Isolation Forest R2: 0.002
RMSE: 7308077.099
In [122]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [123]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Minimum Covariance Determinant¶

In [124]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 2285.355
Minimum Covariance Determinant R2: 0.009
RMSE: 2692.046
In [125]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [126]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Local Outlier Factor¶

In [127]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 2293.635
Local Outlier Factor R2: 0.006
RMSE: 2693.350
In [128]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [129]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

One-Class SVM¶

In [130]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 2310.145
One-Class SVM R2: 0.004
RMSE: 2707.950
In [131]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [132]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Applying polynomial regression on best outlier model¶

In [141]:
poly = PolynomialFeatures(degree=3)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 2487.203
Polynomial R2: 0.085
RMSE: 2936.645

2. followers_count to spot_compensation_in_dollars price¶

In [142]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [143]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [144]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [145]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [146]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [147]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [148]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [149]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [150]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [151]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [152]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [153]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [154]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [157]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Macro Sheet¶

In [158]:
macro = pd.read_excel("macro.xlsx")
In [159]:
macro.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               50 non-null     int64  
 1   deliverable_price_in_dollars  50 non-null     float64
 2   spot_compensation_in_dollars  50 non-null     int64  
 3   unit_id                       50 non-null     int64  
dtypes: float64(1), int64(3)
memory usage: 1.7 KB
In [160]:
x = macro["followers_count"]
y = macro["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [161]:
x = macro["followers_count"]
y = macro["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [162]:
macro.describe().T
Out[162]:
count mean std min 25% 50% 75% max
followers_count 50.0 309281.920 95771.230089 205129.0 233578.25 276740.0 382384.00 495068.0
deliverable_price_in_dollars 50.0 7036.935 4601.449859 615.0 3536.25 6150.0 9225.00 20500.0
spot_compensation_in_dollars 50.0 3860.700 2969.729942 300.0 2000.00 3000.0 4500.00 15000.0
unit_id 50.0 48575.600 3787.040803 39881.0 45904.25 49804.0 51545.25 53236.0
In [163]:
sns.pairplot(macro)
Out[163]:
<seaborn.axisgrid.PairGrid at 0x12654764400>

Treatment for outliers with percentile statistic¶

In [164]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

macro = outlier_detect(macro)
#macro.drop("type", axis=1, inplace=True)
In [165]:
fig = px.scatter(macro, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [166]:
fig = px.scatter(macro, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [167]:
sns.set_style("darkgrid")
sns.heatmap(macro.corr(), annot=True)
Out[167]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [168]:
X = macro['followers_count']
y = macro["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(33, 1) (33,)

Isolation Forest¶

In [169]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 3065.037
Isolation Forest R2: 0.046
RMSE: 15518326.744
In [170]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [171]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Minimum Covariance Determinant¶

In [172]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 3057.157
Minimum Covariance Determinant R2: 0.035
RMSE: 3932.082
In [173]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [174]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Local Outlier Factor¶

In [175]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 3101.212
Local Outlier Factor R2: 0.097
RMSE: 3962.171
In [176]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [177]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

One-Class SVM¶

In [178]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 3088.410
One-Class SVM R2: 0.063
RMSE: 3960.937
In [179]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [180]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Applying polynomial regression on best outlier model¶

In [205]:
poly = PolynomialFeatures(degree=10)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 2967.230
Polynomial R2: 0.164
RMSE: 3790.096

2. followers_count to spot_compensation_in_dollars price¶

In [206]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [207]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [208]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [209]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [210]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [211]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [212]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [213]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [214]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [215]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [216]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [217]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [218]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [219]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Conclusion¶

While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 80% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)

Nano Sheet¶

In [220]:
nano = pd.read_excel("nano.xlsx")
In [221]:
nano.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               59 non-null     int64  
 1   deliverable_price_in_dollars  59 non-null     float64
 2   spot_compensation_in_dollars  59 non-null     int64  
 3   unit_id                       59 non-null     int64  
 4   type                          59 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 2.4+ KB
In [222]:
x = nano["followers_count"]
y = nano["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [223]:
x = nano["followers_count"]
y = nano["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
In [224]:
nano.describe().T
Out[224]:
count mean std min 25% 50% 75% max
followers_count 59.0 3646.000000 2699.682178 11.0 1507.00 2808.0 5790.5 9362.0
deliverable_price_in_dollars 59.0 4049.203390 5240.300565 410.0 1383.75 2050.0 4100.0 21525.0
spot_compensation_in_dollars 59.0 2062.033898 2737.102375 200.0 675.00 1000.0 2000.0 10500.0
unit_id 59.0 49536.745763 3827.341157 40501.0 48059.00 51096.0 52418.5 53699.0
In [225]:
sns.pairplot(nano)
Out[225]:
<seaborn.axisgrid.PairGrid at 0x12653278d90>

Treatment for outliers with percentile statistic¶

In [226]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

nano = outlier_detect(nano)
nano.drop("type", axis=1, inplace=True)
In [227]:
fig = px.scatter(nano, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [228]:
fig = px.scatter(nano, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [229]:
sns.set_style("darkgrid")
sns.heatmap(nano.corr(), annot=True)
Out[229]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [230]:
X = nano['followers_count']
y = nano["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(39, 1) (39,)

Isolation Forest¶

In [231]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 1219.163
Isolation Forest R2: 0.003
RMSE: 2191916.671
In [232]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [233]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Minimum Covariance Determinant¶

In [234]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 1193.798
Minimum Covariance Determinant R2: 0.042
RMSE: 1440.532
In [235]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [236]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Local Outlier Factor¶

In [237]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 1197.429
Local Outlier Factor R2: 0.059
RMSE: 1440.845
In [238]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [239]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

One-Class SVM¶

In [240]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 1192.820
One-Class SVM R2: 0.043
RMSE: 1438.110
In [241]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [242]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()

Applying polynomial regression on best outlier model¶

In [247]:
poly = PolynomialFeatures(degree=4)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 1199.604
Polynomial R2: 0.173
RMSE: 1480.782

2. followers_count to spot_compensation_in_dollars price¶

In [248]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [249]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [250]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [251]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [252]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [253]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [254]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [255]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [256]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [257]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [258]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [259]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [260]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [261]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Conclusion¶

Followers don't have linear relationship with deliverable_price and spot_compensation_price. Followers don't impact deliverable price¶

Conclusion¶

While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 100% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)

Web Sheet¶

In [262]:
web = pd.read_excel("web.xlsx")
In [263]:
web.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   followers_count               11 non-null     int64  
 1   deliverable_price_in_dollars  11 non-null     float64
 2   spot_compensation_in_dollars  11 non-null     float64
 3   unit_id                       11 non-null     int64  
 4   type                          11 non-null     object 
dtypes: float64(2), int64(2), object(1)
memory usage: 568.0+ bytes
In [264]:
x = web["followers_count"]
y = web["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=11

In [265]:
x = web["followers_count"]
y = web["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=11

In [266]:
web.describe().T
Out[266]:
count mean std min 25% 50% 75% max
followers_count 11.0 1.826295e+06 1.235187e+06 1029606.0 1122045.00 1469865.0 1758976.5 5349027.0
deliverable_price_in_dollars 11.0 1.198093e+04 8.923663e+03 0.0 5555.09 10762.5 19400.0 24600.0
spot_compensation_in_dollars 11.0 6.592691e+03 4.902140e+03 0.0 2709.80 5250.0 11500.0 12500.0
unit_id 11.0 4.712045e+04 3.894937e+03 41045.0 43959.00 47911.0 50161.5 52926.0
In [267]:
sns.pairplot(web)
Out[267]:
<seaborn.axisgrid.PairGrid at 0x126567f26a0>

Treatment for outliers with percentile statistic¶

In [268]:
#Outlier Treatment

def outlier_detect(df):
    for i in df.describe().columns:
        Q1=df.describe().at['25%',i]
        Q3=df.describe().at['75%',i]
        IQR=Q3 - Q1
        LTV=Q1 - 1.5 * IQR
        UTV=Q3 + 1.5 * IQR
        x=np.array(df[i])
        p=[]
        for j in x:
            if j < LTV or j>UTV:
                p.append(df[i].median())
            else:
                p.append(j)
        df[i]=p
    return df

web = outlier_detect(web)
web.drop("type", axis=1, inplace=True)
In [269]:
fig = px.scatter(web, x="followers_count", y="deliverable_price_in_dollars",
           
           color="followers_count",
           title="Scatter Plot followers_count vs deliverable_price_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [270]:
fig = px.scatter(web, x="followers_count", y="spot_compensation_in_dollars",
           hover_data=["unit_id"],
           color="followers_count",
           title="Scatter Plot followers_count vs spot_compensation_in_dollars",
                 )
           
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
In [271]:
sns.set_style("darkgrid")
sns.heatmap(web.corr(), annot=True)
Out[271]:
<AxesSubplot:>

Question:¶

I am trying to find if followers impact deliverable and spot price - follow to spot price & followers to deliverable price.¶

I am trying to find if followers impact deliverable and spot price?¶

1. followers_count to deliverable_price_in_dollars¶

In [272]:
X = web['followers_count']
y = web["deliverable_price_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(7, 1) (7,)

Isolation Forest¶

In [273]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 7047.335
Isolation Forest R2: 0.104
RMSE: 94301228.701
In [274]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [275]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning:

omni_normtest is not valid with less than 8 observations; 6 samples were given.

Minimum Covariance Determinant¶

In [276]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 7047.335
Minimum Covariance Determinant R2: 0.104
RMSE: 9710.882
In [277]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [278]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning:

omni_normtest is not valid with less than 8 observations; 6 samples were given.

Local Outlier Factor¶

In [279]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 6184.800
Local Outlier Factor R2: 0.005
RMSE: 8075.137
C:\Users\DELL\anaconda3\lib\site-packages\sklearn\neighbors\_lof.py:274: UserWarning:

n_neighbors (10) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.

In [280]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [281]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning:

omni_normtest is not valid with less than 8 observations; 6 samples were given.

One-Class SVM¶

In [282]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 11543.009
One-Class SVM R2: 0.420
RMSE: 16265.347
In [283]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
In [284]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning:

omni_normtest is not valid with less than 8 observations; 5 samples were given.

Applying polynomial regression on best outlier model¶

In [285]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 107200.131
Polynomial R2: 0.947
RMSE: 189914.795

2. followers_count to spot_compensation_in_dollars price¶

In [286]:
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test 
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)

Isolation Forest¶

In [287]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028
Isolation Forest R2: 0.071
RMSE: 49659849.050
In [288]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [289]:
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Minimum Covariance Determinant¶

In [290]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028
Minimum Covariance Determinant R2: 0.071
RMSE: 7046.974
In [291]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [292]:
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

Local Outlier Factor¶

In [293]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151
Local Outlier Factor R2: 0.122
RMSE: 7054.052
In [294]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [295]:
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=9

One-Class SVM¶

In [296]:
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853
One-Class SVM R2: 0.233
RMSE: 7171.741
In [297]:
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20) 
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
In [298]:
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x) 
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning:

kurtosistest only valid for n>=20 ... continuing anyway, n=8

Applying polynomial regression on best outlier model¶

In [299]:
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696
Polynomial R2: 0.848
RMSE: 17154.534

Conclusion¶

Followers don't have linear relationship with deliverable_price and spot_compensation_price. Followers don't impact deliverable price¶

Conclusion¶

While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 77% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)